
/* This code reads in 4 excel files that were exported from the IPEDS Access database "IPEDS202223", which was stored in IPEDS-2022_23_Provisional.zip, 
downloaded from https://nces.ed.gov/ipeds/use-the-data/download-access-database, on 4/12/2024. 

This code also reads in a text file that was created by copying and pasting the full ranking (434 institutions) of the US News and World Report Social Mobility Rankings for national universities: https://premium.usnews.com/best-colleges/rankings/national-universities/social-mobility.

It runs on Stata 17, and requires that HD2022, SFA2122_P1, DRVIC2022, SAL2022_IS from IPEDS, and the text file USNEWs.txt be located in the folder indicated in the "cd" statement.


*/ 

* THIS MUST BE CHANGED TO WHERE ALL RAW DATA IS LOCATED
cd "D:\Internal External Disparities Op Ed"



* Import Header data (eg. school name, type)

#delimit cr

import excel HD2022, first clear

save IPEDSHeader.dta, replace

* Import financial aid data

import excel SFA2122_P1, first clear

rename SCFA1N total_fall_cohort
rename SCFA12P pct_instate
rename SCFA13P pct_outofstate
rename PGRNT_P pct_ftftpell
rename UPGRNTP pct_pell

keep UNITID total_fall_cohort pct_instate pct_outofstate pct_ftftpell pct_pell

save FinancialAid.dta, replace


* Import cost data
import excel DRVIC2022, first clear

rename CINSON instate_cost
rename COTSON outofstate_cost
rename unitid UNITID
keep UNITID instate_cost outofstate_cost

*note that there is incomplete information here
merge 1:1 UNITID using FinancialAid.dta

drop _merge

*calculate the total cost of attendance, which is in-state cost times the percent of students who are in state (goes from 0-100, so divide by 100) plus the out-of-state-cost times the percent of students who are out-of state (goes from 0-11, so divide by 100)

gen ln_total_cost = ln( ((pct_instate/100)*instate_cost) + ((pct_outofstate/100)*outofstate_cost) )
replace ln_total_cost=ln(instate_cost) if missing(pct_outofstate)

gen total_cost = ((pct_instate/100)*instate_cost) + ((pct_outofstate/100)*outofstate_cost)
replace total_cost=instate_cost if missing(pct_outofstate)

compress
save IPEDS_cost.dta, replace


* Import pay by gender and rank

import excel SAL2022_IS, first clear

merge m:1 UNITID using IPEDSHeader.dta

drop if _merge==2

drop _merge


merge m:1 UNITID using IPEDS_cost.dta

drop if _merge==2

drop _merge

 *calculate the difference in salary - the extra paid to men as a percent of women's pay
gen MFGAP=100*(SAEQ9AM-SAEQ9AW)/SAEQ9AW


#delimit ;

label define rank
1 "Professor"
2 "Associate professor"
3 "Assistant professor"
4 "Instructor"
5 "Lecturer"
6 "No academic rank"
7 "All instructional staff total";
 

label value ARANK rank;

* Just keep tenured and tenure track faculty: 1 = Full, 2 = Associate, 3 = Assistant;

keep if ARANK<4;

#delimit cr
*Some clean up to match US News and World Report ratings

replace INSTNM="University of St. Thomas (TX)" if INSTNM=="University of St Thomas" & CITY=="Houston"
replace INSTNM="Marian University (IN)" if INSTNM=="Marian University" & CITY=="Indianapolis"
replace INSTNM="University of St. Thomas (MN)" if INSTNM=="University of St Thomas" & CITY=="Saint Paul"	

save IPEDS.dta, replace

* Read in this text file that was a "select all" and paste from US News and World Report Website.
insheet using USNews.txt, clear

replace v1=v2 if v1==""
gen x=v1==""
drop if v1=="ADVERTISING" | v1=="ADVERTISEMENT"
gen y=sum(x)
drop if x==1
by y, sort: gen line=_n

gen INSTNAM1=v1 if line==1
gen city1=v1 if line==2
gen rank1=v1 if line==3

by y, sort: egen INSTNM=mode(INSTNAM1)
by y, sort: egen city=mode(city1)
by y, sort: egen rank=mode(rank1)

keep if line==1

keep INSTNM city rank

duplicates drop
destring rank, gen(mobility_rank) ignore("#")

* Clean up some institution names to match how they are recorded in IPEDS 
replace INSTNM="Arizona State University Campus Immersion" if INSTNM=="Arizona State University"
replace INSTNM="Bethel University" if INSTNM=="Bethel University (MN)"
replace INSTNM="Binghamton University" if INSTNM=="Binghamton University--SUNY"
replace INSTNM="Bowling Green State University-Main Campus" if INSTNM=="Bowling Green State University"
replace INSTNM="CUNY City College" if INSTNM=="CUNY--City College"
replace INSTNM="California State University-Fresno" if INSTNM=="California State University, Fresno"
replace INSTNM="California State University-East Bay" if INSTNM=="California State University--East Bay"
replace INSTNM="California State University-Fullerton" if INSTNM=="California State University--Fullerton"
replace INSTNM="California State University-Long Beach" if INSTNM=="California State University--Long Beach"
replace INSTNM="California State University-San Bernardino" if INSTNM=="California State University--San Bernardino"
replace INSTNM="Colorado State University-Fort Collins" if INSTNM=="Colorado State University"
replace INSTNM="Colorado Technical University-Colorado Springs" if INSTNM=="Colorado Technical University"
replace INSTNM="Columbia University in the City of New York" if INSTNM=="Columbia University"
replace INSTNM="Concordia University-Irvine" if INSTNM=="Concordia University -- Irvine"
replace INSTNM="Concordia University-Wisconsin" if INSTNM=="Concordia University Wisconsin"
replace INSTNM="D'Youville  University" if INSTNM=="D'Youville University"
replace INSTNM="Florida Agricultural and Mechanical University" if INSTNM=="Florida A&M University"
replace INSTNM="Georgia Institute of Technology-Main Campus" if INSTNM=="Georgia Institute of Technology"
replace INSTNM="Indiana University of Pennsylvania-Main Campus" if INSTNM=="Indiana University of Pennsylvania"
replace INSTNM="Indiana University-Bloomington" if INSTNM=="Indiana University--Bloomington"
replace INSTNM="Indiana University-Purdue University-Indianapolis" if INSTNM=="Indiana University-Purdue University--Indianapolis"
replace INSTNM="Iowa State University" if INSTNM=="Iowa State University of Science and Technology"
replace INSTNM="Keiser University-Ft Lauderdale" if INSTNM=="Keiser University"
replace INSTNM="Kent State University at Kent" if INSTNM=="Kent State University"
replace INSTNM="Louisiana State University and Agricultural & Mechanical College" if INSTNM=="Louisiana State University--Baton Rouge"
replace INSTNM="Maryville University of Saint Louis" if INSTNM=="Maryville University of St. Louis"
replace INSTNM="Miami University-Oxford" if INSTNM=="Miami University--Oxford"
replace INSTNM="Missouri State University-Springfield" if INSTNM=="Missouri State University"
replace INSTNM="Mount Saint Joseph University" if INSTNM=="Mount St. Joseph University"
replace INSTNM="New Mexico State University-Main Campus" if INSTNM=="New Mexico State University"
replace INSTNM="North Carolina A & T State University" if INSTNM=="North Carolina Agricultural and Technical State University"
replace INSTNM="North Carolina State University at Raleigh" if INSTNM=="North Carolina State University"
replace INSTNM="North Dakota State University-Main Campus" if INSTNM=="North Dakota State University"
replace INSTNM="Ohio University-Main Campus" if INSTNM=="Ohio University"
replace INSTNM="Oklahoma State University-Main Campus" if INSTNM=="Oklahoma State University"
replace INSTNM="Pontifical Catholic University of Puerto Rico-Ponce" if INSTNM=="Pontifical Catholic University of Puerto Rico--Ponce"
replace INSTNM="Prairie View A & M University" if INSTNM=="Prairie View A&M University"
replace INSTNM="Purdue University-Main Campus" if INSTNM=="Purdue University--Main Campus"
replace INSTNM="Rutgers University-Camden" if INSTNM=="Rutgers University--Camden"
replace INSTNM="Rutgers University-New Brunswick" if INSTNM=="Rutgers University--New Brunswick"
replace INSTNM="Rutgers University-Newark" if INSTNM=="Rutgers University--Newark"
replace INSTNM="Southern Illinois University-Edwardsville" if INSTNM=="Southern Illinois University Edwardsville"
replace INSTNM="Southern Illinois University-Carbondale" if INSTNM=="Southern Illinois University--Carbondale"
replace INSTNM="Southern University and A & M College" if INSTNM=="Southern University and A&M College"
replace INSTNM="Saint Ambrose University" if INSTNM=="St. Ambrose University"
replace INSTNM="St Catherine University" if INSTNM=="St. Catherine University"
replace INSTNM="St. John's University-New York" if INSTNM=="St. John's University (NY)"
replace INSTNM="Stony Brook University" if INSTNM=="Stony Brook University--SUNY"
replace INSTNM="Tennessee Technological University" if INSTNM=="Tennessee Tech University"
replace INSTNM="Texas A & M University-College Station" if INSTNM=="Texas A&M University"
replace INSTNM="Texas A & M University-Commerce" if INSTNM=="Texas A&M University--Commerce"
replace INSTNM="Texas A & M University-Corpus Christi" if INSTNM=="Texas A&M University--Corpus Christi"
replace INSTNM="Texas A & M University-Kingsville" if INSTNM=="Texas A&M University--Kingsville"
replace INSTNM="The College of Saint Scholastica" if INSTNM=="The College of St. Scholastica"
replace INSTNM="Ohio State University-Main Campus" if INSTNM=="The Ohio State University"
replace INSTNM="Pennsylvania State University-Main Campus" if INSTNM=="The Pennsylvania State University--University Park"
replace INSTNM="University of Akron Main Campus" if INSTNM=="The University of Akron"
replace INSTNM="Trinity International University-Illinois" if INSTNM=="Trinity International University"
replace INSTNM="Tulane University of Louisiana" if INSTNM=="Tulane University"
replace INSTNM="Universidad Ana G. Mendez-Gurabo Campus" if INSTNM=="Universidad Ana G. Mendez--Gurabo Campus"
replace INSTNM="SUNY at Albany" if INSTNM=="University at Albany--SUNY"
replace INSTNM="University at Buffalo" if INSTNM=="University at Buffalo--SUNY"
replace INSTNM="University of Alabama in Huntsville" if INSTNM=="University of Alabama at Huntsville"
replace INSTNM="University of Alaska Fairbanks" if INSTNM=="University of Alaska--Fairbanks"
replace INSTNM="University of California-Berkeley" if INSTNM=="University of California, Berkeley"
replace INSTNM="University of California-Davis" if INSTNM=="University of California, Davis"
replace INSTNM="University of California-Irvine" if INSTNM=="University of California, Irvine"
replace INSTNM="University of California-Los Angeles" if INSTNM=="University of California, Los Angeles"
replace INSTNM="University of California-Merced" if INSTNM=="University of California, Merced"
replace INSTNM="University of California-Riverside" if INSTNM=="University of California, Riverside"
replace INSTNM="University of California-San Diego" if INSTNM=="University of California, San Diego"
replace INSTNM="University of California-Santa Barbara" if INSTNM=="University of California, Santa Barbara"
replace INSTNM="University of California-Santa Cruz" if INSTNM=="University of California, Santa Cruz"
replace INSTNM="University of Cincinnati-Main Campus" if INSTNM=="University of Cincinnati"
replace INSTNM="University of Colorado Denver/Anschutz Medical Campus" if INSTNM=="University of Colorado Denver"
replace INSTNM="The University of Findlay" if INSTNM=="University of Findlay"
replace INSTNM="University of Houston-Clear Lake" if INSTNM=="University of Houston--Clear Lake"
replace INSTNM="University of Illinois Chicago" if INSTNM=="University of Illinois--Chicago"
replace INSTNM="University of Maryland-Baltimore County" if INSTNM=="University of Maryland, Baltimore County"
replace INSTNM="University of Maryland-College Park" if INSTNM=="University of Maryland, College Park"
replace INSTNM="University of Maryland Eastern Shore" if INSTNM=="University of Maryland, Eastern Shore"
replace INSTNM="University of Massachusetts-Amherst" if INSTNM=="University of Massachusetts--Amherst"
replace INSTNM="University of Massachusetts-Boston" if INSTNM=="University of Massachusetts--Boston"
replace INSTNM="University of Massachusetts-Dartmouth" if INSTNM=="University of Massachusetts--Dartmouth"
replace INSTNM="University of Massachusetts-Lowell" if INSTNM=="University of Massachusetts--Lowell"
replace INSTNM="University of Michigan-Ann Arbor" if INSTNM=="University of Michigan--Ann Arbor"
replace INSTNM="University of Michigan-Flint" if INSTNM=="University of Michigan--Flint"
replace INSTNM="University of Minnesota-Twin Cities" if INSTNM=="University of Minnesota, Twin Cities"
replace INSTNM="University of Missouri-Columbia" if INSTNM=="University of Missouri"
replace INSTNM="University of Missouri-Kansas City" if INSTNM=="University of Missouri--Kansas City"
replace INSTNM="University of Missouri-St Louis" if INSTNM=="University of Missouri--St. Louis"
replace INSTNM="The University of Montana" if INSTNM=="University of Montana"
replace INSTNM="University of Nebraska-Lincoln" if INSTNM=="University of Nebraska -- Lincoln"
replace INSTNM="University of Nebraska at Omaha" if INSTNM=="University of Nebraska Omaha"
replace INSTNM="University of Nevada-Las Vegas" if INSTNM=="University of Nevada, Las Vegas"
replace INSTNM="University of Nevada-Reno" if INSTNM=="University of Nevada, Reno"
replace INSTNM="University of New Hampshire-Main Campus" if INSTNM=="University of New Hampshire"
replace INSTNM="University of New Mexico-Main Campus" if INSTNM=="University of New Mexico"
replace INSTNM="University of North Carolina at Charlotte" if INSTNM=="University of North Carolina--Charlotte"
replace INSTNM="University of North Carolina Wilmington" if INSTNM=="University of North Carolina--Wilmington"
replace INSTNM="University of Oklahoma-Norman Campus" if INSTNM=="University of Oklahoma"
replace INSTNM="University of Pittsburgh-Pittsburgh Campus" if INSTNM=="University of Pittsburgh"
replace INSTNM="University of Puerto Rico-Rio Piedras" if INSTNM=="University of Puerto Rico--Rio Piedras"
replace INSTNM="University of South Carolina-Columbia" if INSTNM=="University of South Carolina"
replace INSTNM="University of St Francis" if INSTNM=="University of St. Francis"
replace INSTNM="The University of Tennessee-Chattanooga" if INSTNM=="University of Tennessee at Chattanooga"
replace INSTNM="The University of Tennessee-Knoxville" if INSTNM=="University of Tennessee, Knoxville"
replace INSTNM="The University of Texas Rio Grande Valley" if INSTNM=="University of Texas Rio Grande Valley"
replace INSTNM="The University of Texas at Austin" if INSTNM=="University of Texas at Austin"
replace INSTNM="The University of Texas at San Antonio" if INSTNM=="University of Texas at San Antonio"
replace INSTNM="University of Virginia-Main Campus" if INSTNM=="University of Virginia"
replace INSTNM="University of Washington-Seattle Campus" if INSTNM=="University of Washington"
replace INSTNM="University of Wisconsin-La Crosse" if INSTNM=="University of Wisconsin--La Crosse"
replace INSTNM="University of Wisconsin-Madison" if INSTNM=="University of Wisconsin--Madison"
replace INSTNM="University of Wisconsin-Milwaukee" if INSTNM=="University of Wisconsin--Milwaukee"
replace INSTNM="University of Wisconsin-Oshkosh" if INSTNM=="University of Wisconsin--Oshkosh"
replace INSTNM="Virginia Polytechnic Institute and State University" if INSTNM=="Virginia Tech"
replace INSTNM="Washington University in St Louis" if INSTNM=="Washington University in St. Louis"
replace INSTNM="Wright State University-Main Campus" if INSTNM=="Wright State University"

* Create a mobility "score" so that  higher number is "better" - more mobility with a higher score
qui sum mobility_rank

gen mobility_score=r(max)-mobility_rank


save  USNews.dta, replace

merge 1:m INSTNM using IPEDS.dta

keep if _merge==3

* Keep just 4 year public and private 

keep if (SECTOR==1 | SECTOR==2)
gen public=SECTOR==1

gen UC=index(INSTNM, "University of California")>0 & public==1 

* get rid of graduate schools that are listed as public 4 year - note these are identified by having missing undergrad cost information

drop if missing(total_cost)

unique UNITID


* Divide schools into groups, pull out values for graph labels

gen total_pct=0
gen total_pct_type=0

forvalues rank=1(1)3{

sum MFGAP if ARANK==`rank', detail

*Divide sample into 9 groups (so there is a "middle" group), and identify the biggest and smallest gaps in each group

_pctile MFGAP if ARANK==`rank', nq(9)

 
forvalues n=1(1)8 {
replace total_pct=`n' if MFGAP>r(r`n')	& ARANK==`rank'
replace total_pct=. if missing(MFGAP)



	}

	* Do this for public and private seperately
foreach type in 0 1 {
	
sum MFGAP if ARANK==`rank' & public==`type', detail


_pctile MFGAP if ARANK==`rank' & public==`type', nq(9)

 
forvalues n=1(1)8 {
replace total_pct_type=`n' if MFGAP>r(r`n')	& ARANK==`rank' & public==`type'
replace total_pct_type=. if missing(MFGAP)

		}
	
	}	
	
	
}


* create a "third" sector, which is everything

expand 2, gen(duplicate)

replace SECTOR=3 if duplicate==1

replace total_pct_type=total_pct if duplicate==1

# delimit ;
label define sector 

1 "Public, 4-year or above" 
2 "Private not-for-profit, 4-year or above" 
3 "Private and Public";
label value SECTOR sector;


# delimit ;
label define paygap 

0 "Negative"
2 "Small" 
4 "Average" 
6 "Large"
8 "Largest";
label value total_pct_type paygap;


cap log close;
log using regression_output.txt, t replace;

foreach outcome in mobility_score pct_pell pct_ftftpell ln_total_cost {;

di "Public";
by ARANK, sort: reg `outcome' MFGAP if SECTOR==1, robust;
by ARANK, sort: sum `outcome' MFGAP if SECTOR==1, detail;

di "Private";
by ARANK, sort: reg `outcome' MFGAP if SECTOR==2, robust;
by ARANK, sort: sum `outcome' MFGAP if SECTOR==2, detail;

*This regression is the focus- a one percentage point increase in "bonus" means "coeffient on MFGAP" more mobility points, percentage point more Pell students, or percent cost (multiplied by 0;
di "Private and Public";
by ARANK, sort: reg `outcome' public MFGAP if SECTOR==3, robust;

*note 10th percentile of MFGAP is negative when examined by rank;
by ARANK, sort: sum `outcome' MFGAP if SECTOR==3, detail;
};

log close;

* sum up mobility rank, total cost, and pell information by pay gap percentile;

collapse (mean) MFGAP mobility_score ln_total_cost pct_pell pct_ftftpell, by(SECTOR ARANK total_pct_type);

replace ln_total_cost=exp(ln_total_cost)/1000;

#delimit ;

local mobility_score_label "Average Social Mobility Score";
local pct_pell_label "% of Undergrads with Pell Grants";
local pct_ftftpell_label "% Undergrads with Pell Grants" "Full Time First Time Students";
local ln_total_cost_label "Average Total Cost (in $1,000)";

* sum up mobility rank, total cost, and pell information by pay gap percentile for graphs - note that the rank-specific percentiles means that ;
* the "pay gaps groups" have different upper and lower bounds, so they are just labeled essentially 'smallest' to 'lowest';

foreach outcome in mobility_score pct_pell  pct_ftftpell ln_total_cost {;
set scheme s1mono;


twoway scatter `outcome' total_pct_type if SECTOR==2 & ARANK==1, msymbol(circle)||
scatter `outcome'  total_pct_type if SECTOR==2 & ARANK==2 , msymbol(circle_hollow) ||
scatter `outcome'  total_pct_type if SECTOR==2 & ARANK==3 , msymbol(X) mcolor(black)
legend( label(3 "Assistant")  label(2 "Associate") label(1 "Full") row(1) region(lstyle(none)))  
xtitle("Male Faculty Bonus, Private Schools")
xlabel(0 2 4 6 8, valuelabel)
ytitle("``outcome'_label'")
saving(`outcome'_private, replace);
graph export `outcome'_private.jpg, as(jpg) replace;

twoway scatter `outcome' total_pct_type if SECTOR==1 & ARANK==1, msymbol(circle)||
scatter `outcome'  total_pct_type if SECTOR==1 & ARANK==2 , msymbol(circle_hollow) ||
scatter `outcome'  total_pct_type if SECTOR==1 & ARANK==3 , msymbol(X) mcolor(black)
legend( label(3 "Assistant")  label(2 "Associate") label(1 "Full") row(1) region(lstyle(none))) 
xtitle("Male Faculty Bonus, State Schools")
xlabel(0 2 4 6 8, valuelabel)
ytitle("``outcome'_label'")
saving(`outcome'_public, replace);
graph export `outcome'_public.jpg, as(jpg) replace;

twoway scatter `outcome' total_pct_type if SECTOR==3 & ARANK==1, msymbol(circle)||
scatter `outcome'  total_pct_type if SECTOR==3 & ARANK==2 , msymbol(circle_hollow) ||
scatter `outcome'  total_pct_type if SECTOR==3 & ARANK==3 , msymbol(X) mcolor(black) 
legend( label(3 "Assistant")  label(2 "Associate") label(1 "Full") row(1) region(lstyle(none)))  
xtitle("Male Faculty Bonus")
xlabel(0 2 4 6 8, valuelabel)
ytitle("``outcome'_label'")
saving(`outcome'_all, replace);
graph export `outcome'_all.jpg, as(jpg) replace;


};

